In this document we explore the METABRIC dataset.

Basic information

print(paste('Number of samples: ', length(unique(cells$ImageNumber))))
## [1] "Number of samples:  794"
print(paste('Number of cell types: ', length(unique(cells$meta_description))))
## [1] "Number of cell types:  32"
paste('Cell types:')
## [1] "Cell types:"
print(unique(cells$meta_description))
##  [1] "CK^{med}ER^{lo}"            "ER^{hi}CXCL12^{+}"         
##  [3] "CD4^{+} T cells & APCs"     "CD4^{+} T cells"           
##  [5] "Endothelial"                "Fibroblasts"               
##  [7] "Myofibroblasts PDPN^{+}"    "CD8^{+} T cells"           
##  [9] "CK8-18^{hi}CXCL12^{hi}"     "Myofibroblasts"            
## [11] "CK^{lo}ER^{lo}"             "Macrophages"               
## [13] "CK^{+} CXCL12^{+}"          "CK8-18^{hi}ER^{lo}"        
## [15] "CK8-18^{+} ER^{hi}"         "CD15^{+}"                  
## [17] "MHC I & II^{hi}"            "T_{Reg} & T_{Ex}"          
## [19] "CD57^{+}"                   "Ep Ki67^{+}"               
## [21] "CK^{lo}ER^{med}"            "Macrophages & granulocytes"
## [23] "CD38^{+} lymphocytes"       "Ki67^{+}"                  
## [25] "HER2^{+}"                   "B cells"                   
## [27] "Basal"                      "Fibroblasts FSP1^{+}"      
## [29] "Granulocytes"               "MHC I^{hi}CD57^{+}"        
## [31] "Ep CD57^{+}"                "MHC^{hi}CD15^{+}"
ggplot(data= cells %>% count(ImageNumber)) + geom_bar(aes(y=n, x=ImageNumber), stat='identity') + theme_bw() + ggtitle('Total counts per sample')

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'totalcounts.pdf', sep=''))
}
## Saving 7 x 5 in image
ggplot(data=cells %>% group_by(ImageNumber) %>% count(meta_description)) + geom_boxplot(aes(x=meta_description, y=n)) + theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('Cell type counts per sample') + ylim(0,100)
## Warning: Removed 2783 rows containing non-finite values (`stat_boxplot()`).

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'samplecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
## Warning: Removed 2783 rows containing non-finite values (`stat_boxplot()`).
cell_occurences <- merge(cell_counts %>% expand(tnumber, meta_description), cell_counts, by = c('tnumber','meta_description'), all.x = TRUE) %>%
                  mutate(n = coalesce(n, 0)) %>%
                  mutate(category = ifelse(n==0,'0', ifelse(n<10, '0<10', ifelse(n<50,'10<50','>50')))) %>% group_by(meta_description) %>%
                  count(category)

lvls <- cell_occurences %>% filter(category == '0') %>% arrange(desc(n)) %>% pull(meta_description)

#Haakje: Zijn de samples met hoge counts in cell types met veel 0 counts, verrijkt in bepaaplde subtypes of iets dergelijks?
ggplot(cell_occurences, aes(x=factor(meta_description, level = lvls), y=n, fill=factor(category, levels=c('>50','10<50','0<10','0')))) + 
    geom_bar(position="fill", stat="identity") + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  ggtitle('Cell type counts in samples') + ylab('proportion') + xlab('')+ guides(fill=guide_legend(title="occurences"))

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'celltypecounts.pdf', sep=''))
}
## Saving 7 x 5 in image

Images from the METABRIC cohort are small compared to the NABUCCO data and therefore have a lower cell count.

nabucco_data <- read_tsv(here('DATA/nabucco_1_densities.tsv'))
## Rows: 382 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): tnumber, phenotype, assigned_loc
## dbl (3): n, area, density
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(data= nabucco_data %>% group_by(tnumber) %>% count(n)) + geom_bar(aes(y=n, x=tnumber), stat='identity') + theme_bw() + ggtitle('Total counts per sample in NABUCCO cohort')  + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_totalcounts.pdf', sep=''))
}
## Saving 7 x 5 in image
ggplot(data=nabucco_data) + geom_boxplot(aes(x=phenotype, y=n)) + theme_bw() + ylim(0,1000) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('Cell type counts per sample in NABUCCO cohort')
## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_samplecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).
NABUCCO_cell_occurences <- merge(nabucco_data %>% expand(tnumber, phenotype), nabucco_data, by = c('tnumber','phenotype'), all.x = TRUE) %>%
                  mutate(n = coalesce(n, 0)) %>%
                  mutate(category = ifelse(n==0,'0', ifelse(n<10, '0<10', ifelse(n<50,'10<50','>50')))) %>% group_by(phenotype) %>%
                  count(category)

lvls <- NABUCCO_cell_occurences %>% filter(category == '>50') %>% arrange(n) %>% pull(phenotype)

ggplot(NABUCCO_cell_occurences, aes(x=factor(phenotype, level = lvls), y=n, fill=factor(category, levels=c('>50','10<50','0<10','0')))) + 
    geom_bar(position="fill", stat="identity") + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  ggtitle('Cell type counts in samples in NABUCCO cohort') + ylab('proportion') + xlab('')+ guides(fill=guide_legend(title="occurences"))

if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_celltypecounts.pdf', sep=''))
}
## Saving 7 x 5 in image

Cell type occurences

In future steps we are going to estimate distances between cell type combinations. Not every combination is present in every sample however. We have a large sample size and this possibly compensates for the low cell type count. Cell types that are too rare cannot be considered.

cell_occurences_combinations <- getCombinationCounts()

print(paste('number of samples that cannot be estimated: ', nrow(cell_occurences_combinations %>% filter(n_from == 0 | n_to ==0))))
## [1] "number of samples that cannot be estimated:  498295"
heatmap_occurences <- function(dataset, occ_number){
  cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
                                                                       group_by(phenotype_from, phenotype_to) %>%
                                                                                      count(category) %>% mutate(n = n/792)
  
  df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))

  trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
  hm <- Heatmap(trans_df, cluster_rows = T, cluster_columns = T, row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
                name = paste('sample percentage \n with', paste(as.character(occ_number),'+',sep=''), 'occ.'), column_title = 'phenotype_to', row_title = 'phenotype_from')
  
  print(hm)
  
  if (params$save_files){
  save_pdf(hm, paste(here('output/Data_exploration/'), occ_number, 'plus_occurences_heatmap.pdf', sep=''), width=6, height=4)
}
}

heatmap_occurences(cell_occurences_combinations, 0)

heatmap_occurences(cell_occurences_combinations, 5)

heatmap_occurences(cell_occurences_combinations, 10)

heatmap_occurences(cell_occurences_combinations, 20)

Look in each subtype.

cell_occurences_combinations <- getCombinationCounts()

heatmap_occurences <- function(dataset, occ_number, subtype){
  cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
                                                                       group_by(phenotype_from, phenotype_to) %>%
                                                                                      count(category) %>% 
                                                                      mutate(n = n/ length(unique(dataset %>% pull(tnumber))))
  
  df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))

  trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
  hm <- Heatmap(trans_df, cluster_rows = F, cluster_columns = F, row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
                name = paste('sample percentage \n of type', subtype, '\n with', paste(as.character(occ_number),'+',sep=''), 'occ.'), column_title = 'phenotype_to', row_title = 'phenotype_from')
  
  print(hm)
}

heatmap_occurences(cell_occurences_combinations, 0,'ALL')

subtypes = group_split(clinical_data %>% group_by(PAM50))
for (s in 1:length(subtypes)){
  subtype = subtypes[[s]]
  heatmap_occurences(cell_occurences_combinations %>% filter(tnumber %in% subtype$ImageNumber), 0,unique(subtype$PAM50))
}

If we remove combinations in samples with a threshold count, the number of samples decreases fast.

thresholds <- tibble(threshold = c(0:200))
                     
compute_from <- function(t){
   return(nrow(cell_occurences_combinations %>% filter(n_to >= t)))
}

compute_both <- function(t){
   return(nrow(cell_occurences_combinations %>% filter(n_to >= t & n_from >= t)))
}

n_from <- apply(thresholds,1, compute_from)
n_both <- apply(thresholds,1, compute_both)

thresholds <- cbind(thresholds, n_from, n_both)

ggplot(data=thresholds) + geom_point(aes(x=threshold, y=n_from, color = 'phenotype_from OR phenotype_to'))  + 
  geom_point(aes(x=threshold, y=n_both, color='phenotype_from AND phenotype_to')) + theme_bw() + ggtitle('Number of samples with at least x cells') + ylab('count') + xlab('n cells') + xlim(0,50)
## Warning: Removed 150 rows containing missing values (`geom_point()`).
## Removed 150 rows containing missing values (`geom_point()`).

if (params$save_files){
ggsave(here('output/Data_exploration/occurence_slope.pdf'))
}
## Saving 7 x 5 in image
## Warning: Removed 150 rows containing missing values (`geom_point()`).
## Removed 150 rows containing missing values (`geom_point()`).

Clinical data

Look at the distribution of breast cancer subtypes.

clinical_data <- read_csv(here('DATA/IMCClinical.csv'))
## Rows: 709 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): metabric_id, ERStatus, LymphNodesOrdinal, sizeOrdinal, PAM50, IntClust
## dbl (3): Grade, yearsToStatus, DeathBreast
## lgl (2): ERBB2_pos, isValidation
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_data_ext <- read_tsv(here('DATA/brca_metabric_clinical_data.tsv'))
## Rows: 2509 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): Study ID, Patient ID, Sample ID, Type of Breast Surgery, Cancer Ty...
## dbl (12): Age at Diagnosis, Cohort, Neoplasm Histologic Grade, Lymph nodes e...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_data_Rueda <- read.delim(here('DATA/41586_2019_1007_MOESM8_ESM.txt')) #https://www-nature-com.tudelft.idm.oclc.org/articles/s41586-019-1007-8#MOESM7

contingency_table <- function(feature1, feature2){
  clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
                         clinical_data_ext %>% select(c('Sample ID', feature2)), by.x='metabric_id', by.y='Sample ID',
                         all.x=T)

table(clinical_data[,2], clinical_data[,3])
  
}

order <- tibble(intclust = c("IntClust 1", "IntClust 2",  "IntClust 3",  "IntClust 4-", "IntClust 4+", "IntClust 5-", "IntClust 5+", "IntClust 6",  "IntClust 7", "IntClust 8", "IntClust 9",  "IntClust 10"))

# Contingency tables
contingency_table('PAM50',"Pam50 + Claudin-low subtype")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(feature1)
## 
##   # Now:
##   data %>% select(all_of(feature1))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(feature2)
## 
##   # Now:
##   data %>% select(all_of(feature2))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
##              
##               Basal claudin-low Her2 LumA LumB Normal
##   Basal          43          45    0    0    0      0
##   HER2            0           3   58    0    0      0
##   Luminal A       0          11    0  217    0      0
##   Luminal B       0           5    0    0  156      0
##   Normal-like     0          27    0    0    0     46
contingency_table('PAM50',"3-Gene classifier subtype" )
##              
##               ER-/HER2- ER+/HER2- High Prolif ER+/HER2- Low Prolif HER2+
##   Basal              67                     2                    1     4
##   HER2                8                    14                    1    33
##   Luminal A           1                    53                  156     7
##   Luminal B           0                   113                   11    16
##   Normal-like        14                     8                   40     4
contingency_table('IntClust',"Integrative Cluster" )
##              
##                1 10  2  3 4ER- 4ER+  5  6  7  8  9
##   IntClust 1  49  0  0  0    0    0  0  0  0  0  0
##   IntClust 10  0 61  0  0    0    0  0  0  0  0  0
##   IntClust 2   0  0 29  0    0    0  0  0  0  0  0
##   IntClust 3   0  0  0 84    0    0  0  0  0  0  0
##   IntClust 4-  0  0  0  0   24   13  0  0  0  0  0
##   IntClust 4+  0  0  0  0    2   73  0  0  0  0  0
##   IntClust 5-  0  0  0  0    0    0 32  0  0  0  0
##   IntClust 5+  0  0  0  0    0    0 32  0  0  0  0
##   IntClust 6   0  0  0  0    0    0  0 28  0  0  0
##   IntClust 7   0  0  0  0    0    0  0  0 60  0  0
##   IntClust 8   0  0  0  0    0    0  0  0  0 80  0
##   IntClust 9   0  0  0  0    0    0  0  0  0  0 44
contingency_table('ERStatus',"ER Status" )
##      
##       Negative Positive
##   neg      104       18
##   pos       28      452
contingency_table('ERStatus',"ER status measured by IHC" )
##      
##       Negative Positve
##   neg      122       0
##   pos        0     480
contingency_table_withindata <- function(feature1, feature2){
  clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
                         clinical_data %>% select(c('metabric_id', feature2)), by.x='metabric_id', by.y='metabric_id',
                         all.x=T)
table(clinical_data[,2], clinical_data[,3])
  
}
contingency_table_withindata('PAM50',"ERStatus" )
##              
##               neg pos
##   Basal        64  22
##   HER2         39  21
##   Luminal A     5 220
##   Luminal B     5 154
##   Normal-like   9  63
contingency_table_Rueda <- function(feature1, feature2){
  clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
                         clinical_data_Rueda %>% select(c('METABRIC.ID', feature2)), by.x='metabric_id', by.y='METABRIC.ID',
                         all.x=T)
table(clinical_data[,2], clinical_data[,3])
  
}

contingency_table_Rueda('PAM50', "Pam50Subtype" )
##              
##               Basal Her2 LumA LumB Normal
##   Basal          88    0    0    0      0
##   HER2            0   61    0    0      0
##   Luminal A       0    0  228    0      0
##   Luminal B       0    0    0  161      0
##   Normal-like     0    0    0    0     73
contingency_table_Rueda('IntClust',"iC10" )
##              
##                1 10  2  3 4ER- 4ER+  5  6  7  8  9
##   IntClust 1  49  0  0  0    0    0  0  0  0  0  0
##   IntClust 10  0 61  0  0    0    0  0  0  0  0  0
##   IntClust 2   0  0 29  0    0    0  0  0  0  0  0
##   IntClust 3   0  0  0 84    0    0  0  0  0  0  0
##   IntClust 4-  0  0  0  0   24   13  0  0  0  0  0
##   IntClust 4+  0  0  0  0    2   73  0  0  0  0  0
##   IntClust 5-  0  0  0  0    0    0 32  0  0  0  0
##   IntClust 5+  0  0  0  0    0    0 32  0  0  0  0
##   IntClust 6   0  0  0  0    0    0  0 28  0  0  0
##   IntClust 7   0  0  0  0    0    0  0  0 60  0  0
##   IntClust 8   0  0  0  0    0    0  0  0  0 80  0
##   IntClust 9   0  0  0  0    0    0  0  0  0  0 44
contingency_table_Rueda('ERStatus',"ER.Expr")
##      
##         -   +
##   neg 104  18
##   pos  28 452
contingency_table_Rueda('ERStatus',"ER.Status")
##      
##       neg pos
##   neg 122   0
##   pos   0 480
# Intclust composition
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"PR Status" )),by.x='intclust',by.y='row.names') %>% rename('PR_positive' = 'Positive', 'PR_negative' = 'Negative')
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"ER Status" )),by.x='intclust',by.y='row.names') %>% rename('ER_positive' = 'Positive', 'ER_negative' = 'Negative')
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"HER2 Status" )),by.x='intclust',by.y='row.names') %>% rename('HER2_positive' = 'Positive', 'HER2_negative' = 'Negative')
melt_order <- reshape2::melt(order, id=c('intclust'))

ggplot(data=melt_order %>% filter(variable=='HER2_positive' | variable=='HER2_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('HER2 counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`

ggplot(data=melt_order %>% filter(variable=='ER_positive' | variable=='ER_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('ER counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`

ggplot(data=melt_order %>% filter(variable=='PR_positive' | variable=='PR_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('PR counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`

order <- tibble(intclust = c("IntClust 1", "IntClust 2",  "IntClust 3",  "IntClust 4-", "IntClust 4+", "IntClust 5-", "IntClust 5+", "IntClust 6",  "IntClust 7", "IntClust 8", "IntClust 9",  "IntClust 10"))

order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"Pam50 + Claudin-low subtype" )),by.x='intclust',by.y='row.names')
melt_order <- reshape2::melt(order, id=c('intclust'))
ggplot(data=melt_order) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw()  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('PAM50 + claudin-low counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`

# subtypes in data
ggplot(data = clinical_data) + geom_bar(aes(x=PAM50)) + theme_bw() + ggtitle('breast cancer subtype occurences')

if (params$save_files){
ggsave(here('output/Data_exploration/subtypes.pdf'))
}
## Saving 7 x 5 in image

Slides

Look at sampled images.

TME =  c("B cells","CD38^{+} lymphocytes","CD4^{+} T cells","CD4^{+} T cells & APCs","CD57^{+}","CD8^{+} T cells","Endothelial","Fibroblasts","Fibroblasts FSP1^{+}","Granulocytes", "Ki67^{+}","Macrophages","Macrophages & granulocytes", "Myofibroblasts","Myofibroblasts PDPN^{+}", "T_{Reg} & T_{Ex}")
tumor = setdiff(unique(cells %>% pull(meta_description)), TME)

samples <- sample(unique(cells$ImageNumber),5)

for (s in samples){
  print(show_slide(s, tumor))
}
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

i = 1
print(show_slide(i,tumor))
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.

ggsave(paste('output/Slides/exploreslide_',i,'.pdf',sep=''), width = 15, height=10)

Expression data

Cell types are classified based on expression profiles.

library(ComplexHeatmap)
expression_profiles_summary <- as.data.frame(cells %>% group_by(meta_description)  %>% 
  summarise_at(vars("Histone H3":"DNA2"), mean))
rownames(expression_profiles_summary) <- expression_profiles_summary$meta_description
expression_profiles_summary <- scale(subset(expression_profiles_summary, select=-c(meta_description)))
  
hm <- Heatmap(expression_profiles_summary, name = 'mean expression', column_title = 'protein', row_title = 'cell type',
        row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6), cluster_rows = T, cluster_columns =  T)

print(hm)

if (params$save_files){
save_pdf(hm, here('output/Data_exploration/expression_profiles.pdf'), width=6, height=4)
}